In this document we test the benefits of using a multi-time-step Markov transition matrix as input to the KL divergence optimization solved by t-SNE. When constructing a proximity matrix for t-SNE, one might consider keeping the bandwidths of the Gaussian kernel small, but then taking higher powers, $P^t$, of the corresponding conditional probability matrix, $P = P_{j|i}$. Varying the power $t$ of $P$ allows for studying the geometry of the data at different scales.
import sys
import os
# Limit the number of threads numpy/scipy are using. The same value is kept
# in `nthreads` so it can be handed to the embedding routines further down.
nthreads = 4
os.environ["OMP_NUM_THREADS"] = str(nthreads)
import multiprocessing as mp
import pickle
import time

import numpy as npd
import numpy as np  # every numpy call in this file goes through the `np` alias
# Custom pyscripts
# NOTE(review): machine-specific absolute path; the commented-out '../' below
# looks like the portable alternative — confirm before running elsewhere.
maindir = '/home/lanhuong/Projects/ManifoldLearning/DiffusionTSNE'
# Remember the directory the session was started from.
curdir = os.getcwd()
#maindir = '../'
# Change into the project root before the project imports below. Every
# relative 'examples/...' path used later in this file is resolved against
# this working directory.
os.chdir(maindir)
from diffusion_tsne import diffusion_tsne
from plotting import *
from generate_data import *
from utils import *
from metrics import *
# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl

# Default font sizes for figure/axes titles, axis labels and tick labels.
mpl.rcParams.update({
    'figure.titlesize': 30,
    'axes.titlesize': 30,
    'axes.labelsize': 20,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})
# Typeset all figure text with LaTeX.
mpl.rc('text', usetex=True)
# matplotlib settings for Jupyter notebooks only
%matplotlib inline
%load_ext autoreload
%autoreload 2
#np.show_config()
# uniformly sampled
# Fix the numpy seed so the sampled data (and every seed drawn with
# np.random later in this file) is reproducible.
np.random.seed(123)
n_points = 3000; noise = 0.5
X, color, y = swiss_roll(n_points, noise)
# Two-column matrix of the (color, y) coordinates returned by swiss_roll;
# it is used as the reference configuration for the quality metrics below.
Xtruth = np.array([color, y]).T
# Distances computed on the reference coordinates; passed to geo_rho as GX.
GX = distance2(Xtruth)
X.shape
Xtruth.shape
p0 = plot2D(Xtruth, label=color, figsize=(10, 2))
# plt.gca() addresses the axes that was just drawn. A bare plt.axes() call
# adds a brand-new, empty axes on current matplotlib releases, so the aspect
# ratio would be applied to the wrong axes.
plt.gca().set_aspect(1.0)
p = plot3D(X, label=color)
import sklearn.decomposition # PCA
# Embed X into two dimensions with PCA and time the fit.
start = time.time()
pca_operator = sklearn.decomposition.PCA(n_components=2)
Y_pca = pca_operator.fit_transform(X)
end = time.time()
print("Embedded PCA in {:.2f} seconds.".format(end-start))
Y_pca.shape
plot2D(Y_pca, label = color,figsize=(6, 5))
# plt.gca() addresses the axes that was just drawn; a bare plt.axes() call
# adds a new, empty axes on current matplotlib releases.
plt.gca().set_aspect(1.0)
# NOTE(review): the co-ranking score is computed against X here, whereas the
# Isomap, t-SNE and UMAP cells pass Xtruth — confirm which reference is
# intended before comparing the curves.
score_pca = coranking_quality(X, Y_pca, 'QNK', 1, 250)
rho_pca = geo_rho(X, Y_pca, GX = GX)
# Colour the embedding by the per-point value returned by geo_rho.
plot2D(Y_pca, label = rho_pca,figsize=(10, 8))
print(np.mean(rho_pca))
print(np.var(rho_pca))
import phate
# Embed X with PHATE (n_jobs=-2) and time the fit.
start = time.time()
phate_operator = phate.PHATE(n_jobs=-2)
Y_phate = phate_operator.fit_transform(X)
end = time.time()
print("Embedded with PHATE in {:.2f} seconds.".format(end-start))
plot2D(Y_phate, label = color,figsize=(6, 5))
# plt.gca() addresses the axes that was just drawn; a bare plt.axes() call
# adds a new, empty axes on current matplotlib releases.
plt.gca().set_aspect(1.0)
# NOTE(review): scored against X, whereas the Isomap, t-SNE and UMAP cells
# pass Xtruth — confirm which reference is intended.
score_phate = coranking_quality(X, Y_phate, 'QNK', 1, 250)
rho_phate = geo_rho(X, Y_phate, GX = GX)
# Colour the embedding by the per-point value returned by geo_rho.
plot2D(Y_phate, label = rho_phate,figsize=(10, 8))
plt.gca().set_aspect(1.0)
print(np.mean(rho_phate))
print(np.var(rho_phate))
from sklearn.manifold import Isomap
n_neighbors = 12
n_components = 2
t0 = time.time()
# The parameters are passed by keyword: they are keyword-only in recent
# scikit-learn releases, where the positional form raises a TypeError.
Y_iso = Isomap(n_neighbors=n_neighbors,
               n_components=n_components).fit_transform(X)
t1 = time.time()
print("Isomap: %.2g sec" % (t1 - t0))
plot2D(Y_iso, label = color, figsize=(10, 2))
# plt.gca() addresses the axes that was just drawn; a bare plt.axes() call
# adds a new, empty axes on current matplotlib releases.
plt.gca().set_aspect(1.0)
#GX = pairwise_distances(np.vstack((color, y)).T)
#GX = pairwise_distances(y.reshape((X.shape[0], 1)))
#Y_iso0 = Y_iso[:, 1].reshape((X.shape[0], 1))
rho_iso = geo_rho(X, Y_iso, GX = GX)
# Colour the embedding by the per-point value returned by geo_rho.
plot2D(Y_iso, label = rho_iso, figsize=(10, 8))
plt.gca().set_aspect(1.0)
score_iso = coranking_quality(Xtruth, Y_iso, "QNK", 1, 250)
# Overlay the Isomap and PHATE co-ranking curves on one axes. Both curves
# share the x-range derived from the length of the Isomap score.
fig, ax = plt.subplots()
n_scores = len(score_iso)
for method_name, method_score in (('Isomap', score_iso),
                                  ('PHATE', score_phate)):
    line1, = ax.plot(range(n_scores), method_score,
                     linewidth=2, label=method_name)
ax.legend(loc='lower right')
plt.show()
print(np.mean(rho_phate))
np.mean(rho_iso)
from sklearn.manifold import TSNE

# Run scikit-learn's t-SNE at perplexity 30 and then 300, timing and
# plotting each embedding.
tsne_embeddings = {}
for tsne_perplexity in (30, 300):
    start = time.time()
    tsne_embeddings[tsne_perplexity] = TSNE(
        n_components=2, perplexity=tsne_perplexity).fit_transform(X)
    end = time.time()
    print('t-SNE embedding in %f sec' %(end-start))
    plot2D(tsne_embeddings[tsne_perplexity], label=color, s=25, figsize=(8,7))
    plt.axis('equal')
Y_tsne_perp30 = tsne_embeddings[30]
Y_tsne_perp300 = tsne_embeddings[300]
# Keyword arguments shared by the three diffusion t-SNE runs below.
shared_kwargs = dict(
    perplexity=30, nthreads=nthreads,
    load_affinities="save", save_files=True,
    affinities_dir="examples/SwissRoll-noisy/",
)

# Run 1: 10 time steps.
# NOTE(review): the variable is named "p25" but perplexity=30 is passed.
start = time.time()
Y_difftsne_p25t10 = diffusion_tsne(
    X, time_steps=10, seed=12345, **shared_kwargs)
end = time.time()
print('Diffusion t-SNE embedding in %f sec' %(end-start))
plot2D(Y_difftsne_p25t10, label=color, s=25, figsize=(8,7))
plt.axis('equal')

# Run 2: 30 time steps. This rebinds the same variable as run 1.
start = time.time()
Y_difftsne_p25t10 = diffusion_tsne(
    X, time_steps=30, seed=475, **shared_kwargs)
end = time.time()
print('Diffusion t-SNE embedding in %f sec' %(end-start))
plot2D(Y_difftsne_p25t10, label=color, s=25, figsize=(8,7))
plt.axis('equal')

# Run 3: 30 time steps with scale_probs=True.
start = time.time()
Y_difftsne_scale = diffusion_tsne(
    X, time_steps=30, seed=1, scale_probs=True, **shared_kwargs)
end = time.time()
print('Diffusion t-SNE embedding in %f sec' %(end-start))
plot2D(Y_difftsne_scale, label=color, s=25, figsize=(8,7))
plt.axis('equal')
# Bare expression: shows the current working directory as notebook output.
os.getcwd()
# Bare expression: natural log of a list of perplexity values.
np.log( [10, 25, 50, 100, 250, 500])
def standard_tsne(X, perp, nthreads, seed):
    """Run `diffusion_tsne` once without a `time_steps` argument and time it.

    The call passes load_affinities="save" and save_files=True together with
    an affinities directory whose name encodes the perplexity and the seed.

    Returns a dict with the keys 'embedding', 'time' (wall-clock seconds of
    the call), 'seed', 'perp' and 'method' (always 'standard_tsne').
    """
    affinities_dir = (
        'examples/SwissRoll-noisy/standard_tsne/affinities_perp{}_it{}/'
        .format(perp, seed)
    )
    t_begin = time.time()
    embedding = diffusion_tsne(
        X, perplexity=perp, seed=seed, nthreads=nthreads,
        load_affinities="save", save_files=True,
        affinities_dir=affinities_dir)
    elapsed = time.time() - t_begin
    return {'embedding': embedding, 'time': elapsed, 'seed': seed,
            'perp': perp, 'method': 'standard_tsne'}
# Run standard t-SNE `ncopies` times for every perplexity in a process pool
# and pickle the collected results.
results = {}
ncopies = 5
perps = [10, 25, 50, 100, 250, 300, 500]
NCORES = 40
# Each worker runs diffusion_tsne with `nthreads` threads, so the core budget
# is divided by nthreads (as the diffusion sweeps further down do) instead of
# being multiplied by it. mp.Pool requires at least one process.
n_workers = max(1, min(len(perps) * ncopies,
                       NCORES // nthreads,
                       mp.cpu_count() // nthreads))
with mp.Pool(processes=n_workers) as pool:
    for perp in perps:
        print('Perplexity: %d' %(perp))
        for i in range(ncopies):
            # Integer upper bound instead of the float literal 1e5.
            seed = np.random.randint(1, 100000, size = 1)[0]
            trial = 'perp' + str(perp) + '_it' + str(i)
            results[trial] = pool.apply_async(
                standard_tsne, args = (X, perp, nthreads, seed))
    # Wait for every submitted task before the pool is torn down on exit.
    pool.close()
    pool.join()
vanilla_tsne_res = {name : result.get() for name, result in results.items()}
filename = 'examples/SwissRoll-noisy/standard_tsne/embeddings.pkl'
with open(filename, 'wb') as handle:
    pickle.dump(vanilla_tsne_res, handle, protocol=pickle.HIGHEST_PROTOCOL)
# One panel per perplexity, using the replicate stored under '_it2'.
perps = [10, 25, 50, 100, 250]
trial_keys = ['perp{}_it2'.format(perp) for perp in perps]
embds = [vanilla_tsne_res[trial_key]['embedding'] for trial_key in trial_keys]
names = [r"$\eta = %d$" % perp for perp in perps]
plot_embdeddings(
    embds, color=color, name_lst=names, figsize=(15, 2.5),
    s=10, edgecolor='black', linewidth=0.1, fontsize=20,
)
# Collect the embeddings and timings for the same perplexities via get_res.
perps = [10, 25, 50, 100, 250]
vanilla_tsne_embd, vanilla_tsne_time = get_res(
    vanilla_tsne_res, color, perps, figsize=(15, 15),
)
plt.subplots_adjust(hspace=0.4, wspace=0.2)
# For every perplexity, evaluate geo_rho on each stored embedding in parallel
# and keep the per-embedding mean and variance of the returned values.
perps = [10, 25, 50, 100, 250, 300, 500]
vanilla_tsne_spearman_mean = {}
vanilla_tsne_spearman_var = {}
keys = list(vanilla_tsne_embd.keys())
for perp in perps:
    cur_keys = [key for key in keys if 'perp' + str(perp) + '_' in key]
    # mp.Pool rejects processes < 1, which is what a perplexity without any
    # stored embedding would otherwise request.
    n_workers = max(1, min(len(cur_keys), mp.cpu_count()))
    with mp.Pool(processes = n_workers) as pool:
        results = {}
        for key in cur_keys:
            Y = vanilla_tsne_embd[key]
            results[key] = pool.apply_async(geo_rho, args = (X, Y, GX))
        # Wait for every task before the pool is torn down on exit.
        pool.close()
        pool.join()
    rho_by_key = {name: res.get() for name, res in results.items()}
    rho_mean_lst = [np.mean(rho) for rho in rho_by_key.values()] # over data points
    rho_var_lst = [np.var(rho) for rho in rho_by_key.values()]
    vanilla_tsne_spearman_mean['perp' + str(perp)] = rho_mean_lst
    vanilla_tsne_spearman_var['perp' + str(perp)] = rho_var_lst
# import pickle
# filename = 'examples/SwissRoll-noisy/standard_tsne/swiss_roll_vanilla_tsne.pkl'
# outfile = open(filename,'wb')
# obj_dict = {'X': X, 'color':color, 'y': y,
# 'vanilla_tsne_res':vanilla_tsne_res,
# 'vanilla_tsne_spearman_mean':vanilla_tsne_spearman_mean,
# 'vanilla_tsne_spearman_var':vanilla_tsne_spearman_var
# }
# pickle.dump(obj_dict, outfile)
# outfile.close()
import pickle
filename = 'examples/SwissRoll-noisy/standard_tsne/swiss_roll_vanilla_tsne.pkl'
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
# The context manager closes the file even when unpickling fails.
with open(filename,'rb') as infile:
    new_dict = pickle.load(infile)
# Publish every stored object as a module-level name. At module level
# locals() is the globals() dict, so this is the explicit spelling.
globals().update(new_dict)
import pandas as pd
# Box plot of the per-run mean geo_rho values, one box per perplexity.
perps = [10, 25, 50, 100, 250, 500]
perp_columns = ['perp{}'.format(perp) for perp in perps]
df = pd.DataFrame(vanilla_tsne_spearman_mean, columns=perp_columns)
# Relabel the columns with the bare perplexity values for the x axis.
df.columns = list(map(str, perps))
plt.figure(figsize=(8, 6))
df.boxplot()
plt.xlabel("Perplexity")
plt.ylabel("Embedding Quality")
plt.rcParams.update({'font.size': 20})
print(df.mean())
print(np.sqrt(df.var()))
# Box plot of the per-run variance of the geo_rho values, one box per
# perplexity.
# NOTE(review): the y label reads "Embedding Quality" although the plotted
# values are variances — confirm the intended label.
perps = [10, 25, 50, 100, 250, 500]
var_columns = ['perp{}'.format(perp) for perp in perps]
dfvar = pd.DataFrame(vanilla_tsne_spearman_var, columns=var_columns)
dfvar.columns = list(map(str, perps))
plt.figure(figsize=(8, 6))
dfvar.boxplot()
plt.xlabel("Perplexity")
plt.ylabel("Embedding Quality")
plt.rcParams.update({'font.size': 20})
# NOTE(review): this lookup runs before vanilla_tsne_corank is assigned
# below, so it presumably relies on the name having been restored by an
# earlier pickle load — confirm.
vanilla_tsne_corank.keys()
# Coranking matrix
# For every perplexity, evaluate coranking_quality against Xtruth on each
# stored embedding in parallel.
perps = [10, 25, 50, 100, 250, 300, 500]
vanilla_tsne_corank = {}
keys = list(vanilla_tsne_embd.keys())
for perp in perps:
    cur_keys = [key for key in keys if 'perp' + str(perp) + '_' in key]
    # mp.Pool rejects processes < 1, which is what a perplexity without any
    # stored embedding would otherwise request.
    n_workers = max(1, min(len(cur_keys), mp.cpu_count()))
    with mp.Pool(processes = n_workers) as pool:
        results = {}
        for key in cur_keys:
            Y = vanilla_tsne_embd[key]
            results[key] = pool.apply_async(
                coranking_quality, args = (Xtruth, Y, 'QNK', 1, 250))
        # Wait for every task before the pool is torn down on exit.
        pool.close()
        pool.join()
    crank = { name: res.get() for name, res in results.items()}
    vanilla_tsne_corank['perp' + str(perp)] = crank
# Plot, for every perplexity, the co-ranking curve averaged over its runs.
fig, ax = plt.subplots(figsize=(8, 6))
for perp in perps:
    pname = 'perp' + str(perp)
    # Stack the per-run curves and average across runs (axis 0).
    perp_corank = np.array(list(vanilla_tsne_corank[pname].values()))
    mean_perp_corank = np.mean(perp_corank, axis=0).ravel()
    n = mean_perp_corank.shape[0]
    line, = ax.plot(range(n), mean_perp_corank,
                    linewidth=2, label=str(perp))
ax.legend(loc=(250, 0), title='Perplexity', bbox_to_anchor=(1.05, 0.3))
plt.show()
import pickle
# Persist the data and all standard t-SNE results in a single pickle.
filename = 'examples/SwissRoll-noisy/standard_tsne/swiss_roll_vanilla_tsne.pkl'
obj_dict = {'X': X, 'color':color, 'y': y,
            'vanilla_tsne_res':vanilla_tsne_res,
            'vanilla_tsne_spearman_mean':vanilla_tsne_spearman_mean,
            'vanilla_tsne_spearman_var':vanilla_tsne_spearman_var,
            'vanilla_tsne_corank':vanilla_tsne_corank
            }
# The context manager closes (and flushes) the file even if dumping fails.
with open(filename,'wb') as outfile:
    pickle.dump(obj_dict, outfile)
def difftsne(X, perp, tstep, nthreads, seed):
    """Run `diffusion_tsne` with scale_probs=False and time the call.

    The call passes load_affinities="save" and save_files=True together with
    an affinities directory whose name encodes perplexity, time step and seed.

    Returns a dict with the keys 'embedding', 'perp', 'tstep', 'method'
    (always 'diffusion_fitsne'), 'seed' and 'time' (wall-clock seconds).
    """
    affinities_dir = (
        'examples/SwissRoll-noisy/diffusion_tsne/'
        'affinities_perp{}_tstep{}_it{}/'.format(perp, tstep, seed)
    )
    t_begin = time.time()
    embedding = diffusion_tsne(
        X, seed=seed, time_steps=tstep, perplexity=perp,
        scale_probs=False, nthreads=nthreads,
        load_affinities="save", save_files=True,
        affinities_dir=affinities_dir)
    elapsed = time.time() - t_begin
    return {'embedding': embedding, 'perp': perp, 'tstep': tstep,
            'method': 'diffusion_fitsne', 'seed': seed,
            'time': elapsed}
# Run diffusion t-SNE `ncopies` times for every (perplexity, time step)
# combination, one process pool per perplexity.
ncopies = 5; NCORES = 40
perps = [10, 20, 25, 30, 50]
tsteps = [1, 5, 10, 20, 50, 100]
nComb = len(tsteps) * ncopies
difftsne_res_dict = {}
for perp in perps:
    # Each worker uses `nthreads` threads; keep at least one worker so that
    # machines with fewer than `nthreads` cores do not request an empty pool.
    n_workers = max(1, min(nComb, NCORES // nthreads,
                           mp.cpu_count() // nthreads))
    with mp.Pool(processes = n_workers) as pool:
        results = {}
        for tstep in tsteps:
            print('Perplexity: %d, t-step %d' %(perp, tstep))
            for i in range(ncopies):
                seed = np.random.randint(1, 100000, size = 1)[0]
                trial = 'perp' + str(perp) + '_tstep' + str(tstep) +'_it' + str(i)
                results[trial] = pool.apply_async(
                    difftsne, args = (X, perp, tstep, nthreads, seed))
        # Wait for every task before the pool is torn down on exit.
        pool.close()
        pool.join()
    # A failing worker re-raises its exception from .get(), not from
    # apply_async, so the best-effort placeholder is applied here.
    res = {}
    for name, result in results.items():
        try:
            res[name] = result.get()
        except Exception as err:
            print('Trial %s failed: %r' % (name, err))
            res[name] = {'embedding' : -1, 'time': -1}
    difftsne_res_dict['perp' + str(perp)] = res
# Merge into a fresh dict so the per-perplexity entries are left untouched.
difftsne_res = {}
for perp in perps:
    difftsne_res.update(difftsne_res_dict['perp' + str(perp)])
# Four panel rows: perplexity 25 and 20, each with and without t = 1. Every
# row shows the replicate stored under '_it0'.
for panel_perp, tsteps in (
        (25, [1, 5, 10, 20, 50, 100]),
        (25, [5, 10, 20, 50, 100]),
        (20, [1, 5, 10, 20, 50, 100]),
        (20, [5, 10, 20, 50, 100]),
):
    embds = [
        difftsne_res['perp%d_tstep%d_it0' % (panel_perp, tstep)]['embedding']
        for tstep in tsteps
    ]
    names = [r"$\eta = %d, t = %d$" % (panel_perp, tstep) for tstep in tsteps]
    plot_embdeddings(
        embds, color=color, name_lst=names, figsize=(16, 2.5), ncol=6,
        s=10, edgecolor='black', linewidth=0.1, fontsize=20,
    )
# Collect the embeddings and timings for the listed combinations via get_res.
perps = [10, 20, 25, 30, 50]
tsteps = [1, 5, 10, 20, 50]
difftsne_embd, difftsne_time = get_res(
    difftsne_res, color, perps, tsteps, figsize=(15, 15),
)
plt.subplots_adjust(hspace=0.4, wspace=0.2)
# Performance
# For every (perplexity, time step) combination, evaluate geo_rho on each
# stored embedding in parallel and keep the per-embedding mean and variance.
perps = [10, 20, 25, 30, 50]
tsteps = [1, 5, 10, 20, 50, 100]
diff_tsne_spearman_mean = {}
diff_tsne_spearman_var = {}
keys = list(difftsne_embd.keys())
for perp in perps:
    for tstep in tsteps:
        comb = 'perp' + str(perp) + '_' + 'tstep' + str(tstep) + "_"
        cur_keys = [key for key in keys if comb in key]
        # mp.Pool rejects processes < 1, which is what a combination without
        # any stored embedding would otherwise request.
        n_workers = max(1, min(len(cur_keys), mp.cpu_count()))
        with mp.Pool(processes = n_workers) as pool:
            results = {}
            for key in cur_keys:
                Y = difftsne_embd[key]
                results[key] = pool.apply_async(geo_rho, args = (X, Y, GX))
            # Wait for every task before the pool is torn down on exit.
            pool.close()
            pool.join()
        rho_by_key = {name: res.get() for name, res in results.items()}
        rho_mean_lst = [np.mean(rho) for rho in rho_by_key.values()] # over data points
        rho_var_lst = [np.var(rho) for rho in rho_by_key.values()]
        diff_tsne_spearman_mean[comb] = rho_mean_lst
        diff_tsne_spearman_var[comb] = rho_var_lst
# Coranking matrix
# For every (perplexity, time step) combination, evaluate coranking_quality
# against Xtruth on each stored embedding in parallel.
perps = [10, 20, 25, 30, 50]
tsteps = [1, 5, 10, 20, 50, 100]
diff_tsne_corank = {}
keys = list(difftsne_embd.keys())
for perp in perps:
    for tstep in tsteps:
        comb = 'perp' + str(perp) + '_' + 'tstep' + str(tstep) + "_"
        cur_keys = [key for key in keys if comb in key]
        # mp.Pool rejects processes < 1, which is what a combination without
        # any stored embedding would otherwise request.
        n_workers = max(1, min(len(cur_keys), mp.cpu_count()))
        with mp.Pool(processes = n_workers) as pool:
            results = {}
            for key in cur_keys:
                Y = difftsne_embd[key]
                results[key] = pool.apply_async(
                    coranking_quality, args = (Xtruth, Y, 'QNK', 1, 250))
            # Wait for every task before the pool is torn down on exit.
            pool.close()
            pool.join()
        crank = { name: res.get() for name, res in results.items()}
        diff_tsne_corank[comb] = crank
# Plot, for every (perplexity, time step) combination, the co-ranking curve
# averaged over its runs.
fig, ax = plt.subplots(figsize=(8, 6))
for perp in perps:
    for tstep in tsteps:
        comb = 'perp' + str(perp) + '_' + 'tstep' + str(tstep) + "_"
        # Stack the per-run curves and average across runs (axis 0).
        comb_corank = np.array(list(diff_tsne_corank[comb].values()))
        mean_comb_corank = np.mean(comb_corank, axis=0).ravel()
        n = mean_comb_corank.shape[0]
        curve_label = 'perp: ' + str(perp) + '; ' + 'tstep: ' + str(tstep)
        line, = ax.plot(range(n), mean_comb_corank,
                        linewidth=2, label=curve_label)
ax.legend(loc=(250, 0), title='Combination', bbox_to_anchor=(1.05, 0.3))
plt.show()
# import pickle
# filename = 'examples/SwissRoll-noisy/diffusion_tsne/swiss_roll_diff_tsne.pkl'
# outfile = open(filename,'wb')
# obj_dict = {'X': X, 'color':color, 'y': y,
# 'difftsne_res':difftsne_res,
# 'diff_tsne_spearman_mean':diff_tsne_spearman_mean,
# 'diff_tsne_spearman_var':diff_tsne_spearman_var,
# 'diff_tsne_corank':diff_tsne_corank
# }
# pickle.dump(obj_dict, outfile)
# outfile.close()
import pickle
filename = 'examples/SwissRoll-noisy/diffusion_tsne/swiss_roll_diff_tsne.pkl'
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
# The context manager closes the file even when unpickling fails.
with open(filename,'rb') as infile:
    new_dict = pickle.load(infile)
# Publish every stored object as a module-level name. At module level
# locals() is the globals() dict, so this is the explicit spelling.
globals().update(new_dict)
## import pandas as pd
# One box plot per perplexity: the boxes are the time steps, the values are
# the per-run mean geo_rho scores. The font size given with each perplexity
# is applied after that figure has been drawn.
for perp_tag, font_size in (('perp10_', 20), ('perp20_', 20),
                            ('perp25_', 16), ('perp30_', 16),
                            ('perp50_', 16)):
    keys = diff_tsne_spearman_mean.keys()
    cols = [key for key in keys if perp_tag in key]
    df = pd.DataFrame(diff_tsne_spearman_mean, columns=cols)
    df.columns = tsteps
    plt.figure(figsize=(8, 6))
    df.boxplot()
    plt.xlabel("Time Steps")
    plt.ylabel("Embedding Quality")
    plt.rcParams.update({'font.size': font_size})
    if perp_tag == 'perp25_':
        # Summary statistics are printed for perplexity 25 only.
        print(df.mean())
        print(np.sqrt(df.var()))
def scaled_difftsne(X, perp, tstep, nthreads, seed):
    """Run `diffusion_tsne` with scale_probs=True and time the call.

    The call passes load_affinities="save" and save_files=True together with
    an affinities directory whose name encodes perplexity, time step and seed.

    Returns a dict with the keys 'embedding', 'perp', 'tstep', 'method',
    'seed' and 'time' (wall-clock seconds).
    """
    affinities_dir = (
        'examples/SwissRoll-noisy/scaled_diffusion_tsne/'
        'affinities_perp{}_tstep{}_it{}/'.format(perp, tstep, seed)
    )
    t_begin = time.time()
    embedding = diffusion_tsne(
        X, seed=seed, time_steps=tstep, perplexity=perp,
        scale_probs=True, nthreads=nthreads,
        load_affinities="save", save_files=True,
        affinities_dir=affinities_dir)
    elapsed = time.time() - t_begin
    # NOTE(review): 'method' carries the same value as in difftsne, so the
    # scaled and unscaled results cannot be told apart by it — confirm intent.
    return {'embedding': embedding, 'perp': perp, 'tstep': tstep,
            'method': 'diffusion_fitsne', 'seed': seed,
            'time': elapsed}
# Run scaled diffusion t-SNE `ncopies` times for every (perplexity, time
# step) combination, one process pool per perplexity.
ncopies = 5; NCORES = 40
perps = [10, 20, 25, 30, 50]
tsteps = [1, 5, 10, 20, 50, 100]
nComb = len(tsteps) * ncopies
scaled_difftsne_res_dict = {}
for perp in perps:
    # Each worker uses `nthreads` threads; keep at least one worker so that
    # machines with fewer than `nthreads` cores do not request an empty pool.
    n_workers = max(1, min(nComb, NCORES // nthreads,
                           mp.cpu_count() // nthreads))
    with mp.Pool(processes = n_workers) as pool:
        results = {}
        for tstep in tsteps:
            print('Perplexity: %d, t-step %d' %(perp, tstep))
            for i in range(ncopies):
                seed = np.random.randint(1, 100000, size = 1)[0]
                trial = 'perp' + str(perp) + '_tstep' + str(tstep) +'_it' + str(i)
                results[trial] = pool.apply_async(
                    scaled_difftsne, args = (X, perp, tstep, nthreads, seed))
        # Wait for every task before the pool is torn down on exit.
        pool.close()
        pool.join()
    # A failing worker re-raises its exception from .get(), not from
    # apply_async, so the best-effort placeholder is applied here.
    res = {}
    for name, result in results.items():
        try:
            res[name] = result.get()
        except Exception as err:
            print('Trial %s failed: %r' % (name, err))
            res[name] = {'embedding' : -1, 'time': -1}
    scaled_difftsne_res_dict['perp' + str(perp)] = res
# Merge into a fresh dict so the per-perplexity entries are left untouched.
scaled_difftsne_res = {}
for perp in perps:
    scaled_difftsne_res.update(scaled_difftsne_res_dict['perp' + str(perp)])
# Four panel rows: perplexity 20 and 25, each with and without t = 1. Every
# row shows the replicate stored under '_it0'.
for panel_perp, tsteps in (
        (20, [1, 5, 10, 20, 50, 100]),
        (20, [5, 10, 20, 50, 100]),
        (25, [1, 5, 10, 20, 50, 100]),
        (25, [5, 10, 20, 50, 100]),
):
    embds = [
        scaled_difftsne_res[
            'perp%d_tstep%d_it0' % (panel_perp, tstep)]['embedding']
        for tstep in tsteps
    ]
    names = [r"$\eta = %d, t = %d$" % (panel_perp, tstep) for tstep in tsteps]
    plot_embdeddings(
        embds, color=color, name_lst=names, figsize=(16, 2.5), ncol=6,
        s=10, edgecolor='black', linewidth=0.1, fontsize=20,
    )
# Collect the embeddings and timings for the listed combinations via get_res.
perps = [10, 20, 25, 30, 50]
tsteps = [1, 5, 10, 20, 50]
scaled_difftsne_embd, scaled_difftsne_time = get_res(
    scaled_difftsne_res, color, perps, tsteps, figsize=(15, 15),
)
plt.subplots_adjust(hspace=0.4, wspace=0.2)
# Performance
# For every (perplexity, time step) combination, evaluate geo_rho on each
# stored scaled embedding in parallel and keep the per-embedding mean and
# variance.
perps = [10, 20, 25, 30, 50]
tsteps = [1, 5, 10, 20, 50, 100]
scaled_diff_tsne_spearman_mean = {}
scaled_diff_tsne_spearman_var = {}
keys = list(scaled_difftsne_embd.keys())
for perp in perps:
    for tstep in tsteps:
        comb = 'perp' + str(perp) + '_' + 'tstep' + str(tstep) + "_"
        cur_keys = [key for key in keys if comb in key]
        # mp.Pool rejects processes < 1, which is what a combination without
        # any stored embedding would otherwise request.
        n_workers = max(1, min(len(cur_keys), mp.cpu_count()))
        with mp.Pool(processes = n_workers) as pool:
            results = {}
            for key in cur_keys:
                Y = scaled_difftsne_embd[key]
                results[key] = pool.apply_async(geo_rho, args = (X, Y, GX))
            # Wait for every task before the pool is torn down on exit.
            pool.close()
            pool.join()
        rho_by_key = {name: res.get() for name, res in results.items()}
        rho_mean_lst = [np.mean(rho) for rho in rho_by_key.values()] # over data points
        rho_var_lst = [np.var(rho) for rho in rho_by_key.values()]
        scaled_diff_tsne_spearman_mean[comb] = rho_mean_lst
        scaled_diff_tsne_spearman_var[comb] = rho_var_lst
# Coranking matrix
# For every (perplexity, time step) combination, evaluate coranking_quality
# against Xtruth on each stored scaled embedding in parallel.
perps = [10, 20, 25, 30, 50]
tsteps = [1, 5, 10, 20, 50, 100]
scaled_diff_tsne_corank = {}
keys = list(scaled_difftsne_embd.keys())
for perp in perps:
    for tstep in tsteps:
        comb = 'perp' + str(perp) + '_' + 'tstep' + str(tstep) + "_"
        cur_keys = [key for key in keys if comb in key]
        # mp.Pool rejects processes < 1, which is what a combination without
        # any stored embedding would otherwise request.
        n_workers = max(1, min(len(cur_keys), mp.cpu_count()))
        with mp.Pool(processes = n_workers) as pool:
            results = {}
            for key in cur_keys:
                Y = scaled_difftsne_embd[key]
                results[key] = pool.apply_async(
                    coranking_quality, args = (Xtruth, Y, 'QNK', 1, 250))
            # Wait for every task before the pool is torn down on exit.
            pool.close()
            pool.join()
        crank = { name: res.get() for name, res in results.items()}
        scaled_diff_tsne_corank[comb] = crank
# Plot, for every (perplexity, time step) combination, the co-ranking curve
# of the scaled runs averaged over its replicates.
fig, ax = plt.subplots(figsize=(8, 6))
for perp in perps:
    for tstep in tsteps:
        comb = 'perp' + str(perp) + '_' + 'tstep' + str(tstep) + "_"
        # Stack the per-run curves and average across runs (axis 0).
        comb_corank = np.array(list(scaled_diff_tsne_corank[comb].values()))
        mean_comb_corank = np.mean(comb_corank, axis=0).ravel()
        n = mean_comb_corank.shape[0]
        curve_label = 'perp: ' + str(perp) + '; ' + 'tstep: ' + str(tstep)
        line, = ax.plot(range(n), mean_comb_corank,
                        linewidth=2, label=curve_label)
ax.legend(loc=(250, 0), title='Combination', bbox_to_anchor=(1.05, 0.3))
plt.show()
# import pickle
# filename = 'examples/SwissRoll-noisy/scaled_diffusion_tsne/swiss_roll_scaled_diff_tsne.pkl'
# outfile = open(filename,'wb')
# obj_dict = {'X': X, 'color':color, 'y': y,
# 'scaled_difftsne_res':scaled_difftsne_res,
# 'scaled_diff_tsne_spearman_mean':scaled_diff_tsne_spearman_mean,
# 'scaled_diff_tsne_spearman_var':scaled_diff_tsne_spearman_var,
# 'scaled_diff_tsne_corank':scaled_diff_tsne_corank
# }
# pickle.dump(obj_dict, outfile)
# outfile.close()
import pickle
filename = 'examples/SwissRoll-noisy/scaled_diffusion_tsne/swiss_roll_scaled_diff_tsne.pkl'
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
# The context manager closes the file even when unpickling fails.
with open(filename,'rb') as infile:
    new_dict = pickle.load(infile)
# Publish every stored object as a module-level name. At module level
# locals() is the globals() dict, so this is the explicit spelling.
globals().update(new_dict)
## import pandas as pd
# One box plot per perplexity for the scaled runs: the boxes are the time
# steps, the values are the per-run mean geo_rho scores.
for perp_tag in ('perp10_', 'perp20_', 'perp25_', 'perp30_', 'perp50_'):
    # Take the column keys from the scaled results themselves rather than
    # from the unscaled dict, so the selection always matches the data that
    # is plotted.
    keys = scaled_diff_tsne_spearman_mean.keys()
    cols = [key for key in keys if perp_tag in key]
    df = pd.DataFrame(scaled_diff_tsne_spearman_mean, columns = cols)
    df.columns = tsteps
    plt.figure(figsize=(8, 6))
    df.boxplot()
    plt.xlabel("Time Steps")
    plt.ylabel("Embedding Quality")
    plt.rcParams.update({'font.size': 16})
import umap
# in our implementation of FIt-SNE we use entropy = np.log(perplexity) whereas in
# umap one has entropy = np.log2(n_neigh) so we need to adjust:
# np.log(100) = np.log2(n_neigh) ==> n_neigh = 2**np.log(100)
# n_neigh [10-15] ==> perplexity [30-50]
def run_umap(X, nthreads, seed):
    """Fit a UMAP embedding of X with the given seed and time the call.

    `seed` is used for both `random_state` and `transform_seed`; every other
    UMAP setting is left at its default. `nthreads` is accepted but not used.

    Returns a dict with the keys 'embedding', 'method' (always 'umap'),
    'seed' and 'time' (wall-clock seconds).
    """
    t_begin = time.time()
    embedding = umap.UMAP(
        random_state=seed, transform_seed=seed).fit_transform(X)
    elapsed = time.time() - t_begin
    return {'embedding': embedding, 'method': 'umap', 'seed': seed,
            'time': elapsed}
# Run UMAP `ncopies` times with independently drawn seeds.
ncopies = 5
# Keep at least one worker so that machines with fewer than `nthreads` cores
# do not request an empty pool.
n_workers = max(1, min(ncopies, NCORES // nthreads,
                       mp.cpu_count() // nthreads))
with mp.Pool(processes = n_workers) as pool:
    results = {}
    for i in range(ncopies):
        # Integer upper bound instead of the float returned by time.time().
        seed = np.random.randint(1, int(time.time()), size = 1)[0]
        trial = 'it' + str(i)
        results[trial] = pool.apply_async(
            run_umap, args = (X, nthreads, seed))
    # Wait for every task before the pool is torn down on exit.
    pool.close()
    pool.join()
# A failing worker re-raises its exception from .get(), not from apply_async,
# so the best-effort placeholder is applied here.
umap_res = {}
for name, result in results.items():
    try:
        umap_res[name] = result.get()
    except Exception as err:
        print('Trial %s failed: %r' % (name, err))
        umap_res[name] = {'embedding' : -1, 'time': -1}
# Show all UMAP replicates side by side, then the first one on its own.
umap_embd_lst = [umap_res[key]['embedding'] for key in umap_res]
plot_embdeddings(umap_embd_lst, color, figsize=(15, 2.5),
                 s = 10, edgecolor='black', linewidth=0.1)
plot2D(umap_embd_lst[0], label=color, s=25, figsize=(6, 5))
# plt.gca() addresses the axes that was just drawn; a bare plt.axes() call
# adds a new, empty axes on current matplotlib releases.
plt.gca().set_aspect(1.0)
# Coranking matrix
# Evaluate coranking_quality against Xtruth on every UMAP replicate.
umap_corank = {}
# Size the pool from the UMAP results themselves, not from `cur_keys`, which
# is a leftover of an unrelated loop earlier in the file.
n_workers = max(1, min(len(umap_res), mp.cpu_count()))
with mp.Pool(processes = n_workers) as pool:
    for key in umap_res.keys():
        Y = umap_res[key]['embedding']
        umap_corank[key] = pool.apply_async(
            coranking_quality, args = (Xtruth, Y, 'QNK', 1, 250))
    # Wait for every task before the pool is torn down on exit.
    pool.close()
    pool.join()
umap_corank = { name: res.get() for name, res in umap_corank.items()}
# geo_rho scores of the UMAP replicates
# Size the pool from the UMAP results themselves, not from `cur_keys`, which
# is a leftover of an unrelated loop earlier in the file.
n_workers = max(1, min(len(umap_res), mp.cpu_count()))
with mp.Pool(processes = n_workers) as pool:
    results= {}
    for key in umap_res.keys():
        Y = umap_res[key]['embedding']
        results[key] = pool.apply_async(geo_rho, args = (X, Y, GX))
    # Wait for every task before the pool is torn down on exit.
    pool.close()
    pool.join()
rho_by_key = {name: res.get() for name, res in results.items()}
umap_spearman_mean = [np.mean(rho) for rho in rho_by_key.values()]
umap_spearman_var = [np.var(rho) for rho in rho_by_key.values()]
print("Mean and std of the average spearman rank correlation" + \
      " for UMAP is: %f +/- %f"
      %(np.mean(umap_spearman_mean), np.std(umap_spearman_mean)))
# One column per UMAP replicate, one row per entry of the co-ranking curve.
umap_corank_df = pd.DataFrame(umap_corank)
# Compute both statistics from the replicate columns before either is added
# to the frame; otherwise the 'mean' column would be counted as an extra
# sample in the standard deviation.
replicate_mean = umap_corank_df.mean(axis = 1)
replicate_sd = umap_corank_df.std(axis = 1)
umap_corank_df['mean'] = replicate_mean
umap_corank_df['sd'] = replicate_sd
umap_corank_df.head()
plt.errorbar(umap_corank_df.index, umap_corank_df['mean'], yerr=umap_corank_df['sd'])
diff_tsne_corank['perp20_tstep1_'].keys()
# Assemble one frame holding the co-ranking curves of every method: the
# single-run baselines plus the mean and sd over replicates for the rest.
corank_df = pd.DataFrame({
    'PCA': score_pca,
    'PHATE': score_phate,
    'ISOMAP': score_iso,
    'UMAP_mean': umap_corank_df['mean'],
    'UMAP_sd': umap_corank_df['sd'],
})
chosen_perp = [25, 50, 100, 250]
chosen_tstep = [5, 10, 20, 50]
for perp in chosen_perp:
    # The perp == 20 case reads the t = 1 diffusion runs instead of the
    # standard ones; with the current chosen_perp it is never taken.
    corank_source = (diff_tsne_corank['perp20_tstep1_'] if perp == 20
                     else vanilla_tsne_corank['perp' + str(perp)])
    df = pd.DataFrame(corank_source)
    tsne_prefix = 'tSNE_perp' + str(perp)
    corank_df[tsne_prefix + '_mean'] = df.mean(axis=1)
    corank_df[tsne_prefix + '_sd'] = df.std(axis=1)
for tstep in chosen_tstep:
    comb_key = 'perp25_tstep' + str(tstep) + "_"
    # Unscaled first, then scaled, so the column order stays the same.
    for column_prefix, corank_by_comb in (
            ('diff_tsne_corank', diff_tsne_corank),
            ('scaled_diff_tsne_corank', scaled_diff_tsne_corank)):
        df = pd.DataFrame(corank_by_comb[comb_key])
        corank_df[column_prefix + str(tstep) + '_mean'] = df.mean(axis=1)
        corank_df[column_prefix + str(tstep) + '_sd'] = df.std(axis=1)
# Second name for the same frame (not a copy).
corank_df0 = corank_df
corank_df.head()
#corank_df = corank_df.iloc[0:50, :]
corank_df = corank_df0
# Every tenth row is used for the scatter markers.
df2plot = corank_df.iloc[::10, :]
tsne_cols = ["#A1D99B", "#41AB5D", "#238B45", "#00441B"]
difftsne_colors = ["#FDAE6B", "#F16913", "#D94801", "#7F2704"]
scaled_diff_tsne_colors = ["#BCBDDC", "#807DBA", "#54278F", "#3F007D"]
# Larger tick labels for this figure; the previous size is set again below.
mpl.rcParams.update({'xtick.labelsize': 30, 'ytick.labelsize': 30})
plt.figure(figsize=(12, 9))
# Drawing order is kept: PCA and PHATE markers, PCA and PHATE lines, then
# the ISOMAP markers and line.
pca_phate_colors = (('PCA', "#CB181D"), ('PHATE', "#E78AC3"))
for method_name, method_color in pca_phate_colors:
    plt.scatter(df2plot.index, df2plot[method_name],
                marker='o', label='', c=method_color)
for method_name, method_color in pca_phate_colors:
    plt.plot(corank_df.index, corank_df[method_name],
             label=method_name, c=method_color, linewidth=2)
plt.scatter(df2plot.index, df2plot['ISOMAP'], marker='o', label='', c="black")
plt.plot(corank_df.index, corank_df['ISOMAP'],
         label='ISOMAP', c="black", linewidth=2)
leg = plt.legend(
    fontsize=20, markerscale=1,
    #loc='upper center', bbox_to_anchor=(1.3, 1.0),
    shadow=True, ncol=1)
# Thicker line samples inside the legend.
for line in leg.get_lines():
    line.set_linewidth(4.0)
plt.xlabel(r"$K$", fontsize=24)
plt.ylabel(r"$Q_{NX}(K)$", fontsize=24)
mpl.rcParams.update({'xtick.labelsize': 16, 'ytick.labelsize': 16})
# Figure: Q_NX(K) for UMAP, vanilla t-SNE, diffusion t-SNE and scaled
# diffusion t-SNE, with the axes limited to K in [0, 50] and Q in [0, 0.8].
# The PCA, PHATE and ISOMAP curves are not drawn in this figure.
df2plot = corank_df.iloc[::10, :]  # every 10th row carries an error bar

tsne_cols = ["#A1D99B", "#41AB5D", "#238B45", "#00441B"]
difftsne_colors = ["#FDAE6B", "#F16913", "#D94801", "#7F2704"]
scaled_diff_tsne_colors = ["#BCBDDC", "#807DBA", "#54278F", "#3F007D"]

# Larger tick labels while this figure's axes are created.
mpl.rcParams.update({'xtick.labelsize': 30, 'ytick.labelsize': 30})
plt.figure(figsize=(12, 9))

# One (column prefix, legend label, colour) entry per curve, in drawing order.
curves = [('UMAP', 'UMAP', "#6BAED6")]
for i, perp in enumerate(chosen_perp):
    curves.append(('tSNE_perp' + str(perp),
                   r'tSNE: $\eta=%d$' %perp,
                   tsne_cols[i]))
for i, tstep in enumerate(chosen_tstep):
    curves.append(('diff_tsne_corank' + str(tstep),
                   r'Diffusion tSNE: $\eta=25, t=%d$' %tstep,
                   difftsne_colors[i]))
for i, tstep in enumerate(chosen_tstep):
    curves.append(('scaled_diff_tsne_corank' + str(tstep),
                   r'Scaled diffusion tSNE: $\eta=25, t=%d$' %tstep,
                   scaled_diff_tsne_colors[i]))

# Mean curve over all rows, then sd error bars on the subsampled rows
# (linewidth=0 hides the connecting line; the empty label keeps the error
# bars out of the legend).
for column, legend_label, colour in curves:
    plt.plot(corank_df.index, corank_df[column + '_mean'],
             label=legend_label, c=colour, linewidth=2)
    plt.errorbar(df2plot.index, df2plot[column + '_mean'],
                 yerr=df2plot[column + '_sd'], capsize=3,
                 linewidth=0, elinewidth=2, label='', c=colour)

# Legend anchored at x=1.3 in axes coordinates, i.e. to the right of the axes.
leg = plt.legend(fontsize=20, markerscale=1,
                 loc='upper center', bbox_to_anchor=(1.3, 1.0),
                 shadow=True, ncol=1)
for handle in leg.get_lines():
    handle.set_linewidth(4.0)

plt.xlabel(r"$K$", fontsize=24)
plt.ylabel(r"$Q_{NX}(K)$", fontsize=24)
plt.xlim(0, 50)
plt.ylim(0, 0.8)

# Put the tick label size back to 16.
mpl.rcParams.update({'xtick.labelsize': 16, 'ytick.labelsize': 16})
# Figure: Q_NX(K) for every method in corank_df (PCA, PHATE, ISOMAP, UMAP,
# vanilla t-SNE, diffusion t-SNE, scaled diffusion t-SNE), default axis limits.
df2plot = corank_df.iloc[::10, :]  # every 10th row carries a marker / error bar

tsne_cols = ["#A1D99B", "#41AB5D", "#238B45", "#00441B"]
difftsne_colors = ["#FDAE6B", "#F16913", "#D94801", "#7F2704"]
scaled_diff_tsne_colors = ["#BCBDDC", "#807DBA", "#54278F", "#3F007D"]

# Larger tick labels while this figure's axes are created.
mpl.rcParams.update({'xtick.labelsize': 30, 'ytick.labelsize': 30})
plt.figure(figsize=(12, 9))

# Methods with a single score column: markers on the subsampled rows and a
# curve over all rows. Drawing order: PCA / PHATE markers, PCA / PHATE
# curves, then ISOMAP markers and curve.
pca_phate = (('PCA', "#CB181D"), ('PHATE', "#E78AC3"))
for method, colour in pca_phate:
    plt.scatter(df2plot.index, df2plot[method], marker='o', label='', c=colour)
for method, colour in pca_phate:
    plt.plot(corank_df.index, corank_df[method], label=method, c=colour,
             linewidth=2)
plt.scatter(df2plot.index, df2plot['ISOMAP'], marker='o', label='', c="black")
plt.plot(corank_df.index, corank_df['ISOMAP'], label='ISOMAP', c="black",
         linewidth=2)

# Methods with '<prefix>_mean' / '<prefix>_sd' columns: one
# (column prefix, legend label, colour) entry per curve, in drawing order.
curves = [('UMAP', 'UMAP', "#6BAED6")]
for i, perp in enumerate(chosen_perp):
    curves.append(('tSNE_perp' + str(perp),
                   r'tSNE: $\eta=%d$' %perp,
                   tsne_cols[i]))
for i, tstep in enumerate(chosen_tstep):
    curves.append(('diff_tsne_corank' + str(tstep),
                   r'Diffusion tSNE: $\eta=25, t=%d$' %tstep,
                   difftsne_colors[i]))
for i, tstep in enumerate(chosen_tstep):
    curves.append(('scaled_diff_tsne_corank' + str(tstep),
                   r'Scaled diffusion tSNE: $\eta=25, t=%d$' %tstep,
                   scaled_diff_tsne_colors[i]))

# Mean curve over all rows, then sd error bars on the subsampled rows
# (linewidth=0 hides the connecting line; the empty label keeps the error
# bars out of the legend).
for column, legend_label, colour in curves:
    plt.plot(corank_df.index, corank_df[column + '_mean'],
             label=legend_label, c=colour, linewidth=2)
    plt.errorbar(df2plot.index, df2plot[column + '_mean'],
                 yerr=df2plot[column + '_sd'], capsize=3,
                 linewidth=0, elinewidth=2, label='', c=colour)

# Legend anchored at x=1.3 in axes coordinates, i.e. to the right of the axes.
leg = plt.legend(fontsize=20, markerscale=1,
                 loc='upper center', bbox_to_anchor=(1.3, 1.0),
                 shadow=True, ncol=1)
for handle in leg.get_lines():
    handle.set_linewidth(4.0)

plt.xlabel(r"$K$", fontsize=24)
plt.ylabel(r"$Q_{NX}(K)$", fontsize=24)

# Put the tick label size back to 16.
mpl.rcParams.update({'xtick.labelsize': 16, 'ytick.labelsize': 16})
# Print, for each perplexity, the mean and standard deviation of the stored
# average Spearman rank correlations for vanilla t-SNE.
# The literal "\;" (a LaTeX spacing command) is part of the printed text, so
# the backslash is escaped explicitly: a bare "\;" in a normal string literal
# is an invalid escape sequence and triggers a Deprecation/SyntaxWarning.
vanilla_msg = ("Mean and std of the average spearman rank correlation"
               " for vanilla t-SNE at perp %d is: %.3f \\; (%.4f)")
for perp in [10, 20, 25, 50, 100, 250, 300, 500]:
    if perp == 20:
        # Perplexity 20 is read from the diffusion results at t-step 1
        # rather than from vanilla_tsne_spearman_mean.
        rho = diff_tsne_spearman_mean['perp20_tstep1_']
    else:
        rho = vanilla_tsne_spearman_mean['perp' + str(perp)]
    print(vanilla_msg % (perp, np.mean(rho), np.std(rho)))
# Print, for every (perplexity, t-step) pair, the mean and standard deviation
# of the stored average Spearman rank correlations for diffusion t-SNE.
#
# Changes relative to the five copy-pasted loops this replaces:
#   * "t-step %dis:" was missing a space and printed e.g. "t-step 5is:".
#   * "\;" is now written "\\;" - same printed text, but no invalid escape
#     sequence warning.
#
# NOTE(review): the number formats differed between perplexities in the
# original (no "\;" separator at perp 10, four decimals for the mean at
# perp 25). They are kept as they were; unify them if that was unintended.
diffusion_value_formats = [
    (10, "%.3f (%.4f)"),
    (20, "%.3f \\; (%.4f)"),
    (25, "%.4f\\; (%.4f)"),
    (30, "%.3f \\; (%.4f)"),
    (50, "%.3f \\; (%.4f)"),
]
for perp, value_format in diffusion_value_formats:
    diffusion_msg = ("Mean and std of the avg. spearman rank correlation"
                     " for Diffusion t-SNE at perp %d and t-step %d is: "
                     + value_format)
    for tstep in tsteps:
        rho = diff_tsne_spearman_mean[
            'perp' + str(perp) + "_tstep" + str(tstep) + "_"]
        print(diffusion_msg % (perp, tstep, np.mean(rho), np.std(rho)))
# Print, for every (perplexity, t-step) pair, the mean and standard deviation
# of the stored average Spearman rank correlations for *scaled* diffusion
# t-SNE (values come from scaled_diff_tsne_spearman_mean).
#
# Changes relative to the five copy-pasted loops this replaces:
#   * The message said "Diffusion t-SNE", which made this output
#     indistinguishable from the unscaled report; it now says "Scaled".
#   * "t-step %dis:" was missing a space and printed e.g. "t-step 5is:".
#   * "\;" is now written "\\;" - same printed text, but no invalid escape
#     sequence warning.
scaled_msg = ("Mean and std of the avg. spearman rank correlation"
              " for Scaled Diffusion t-SNE at perp %d and t-step %d is:"
              " %.3f \\; (%.4f)")
for perp in [10, 20, 25, 30, 50]:
    for tstep in tsteps:
        rho = scaled_diff_tsne_spearman_mean[
            'perp' + str(perp) + "_tstep" + str(tstep) + "_"]
        print(scaled_msg % (perp, tstep, np.mean(rho), np.std(rho)))